In [2]:
import numpy as np
import MyML.EAC.full as myEacFull
import MyML.EAC.sparse as myEacSp
import MyML.EAC.eac as myEac
import MyML.EAC.eac_new as myNewEac
import MyML.cluster.K_Means3 as myKM
import MyML.helper.partition as myEacPart
import MyML.metrics.accuracy as myAcc
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
reload(myEacFull)
reload(myEacPart)
reload(myEacSp)
reload(myEac)
reload(myNewEac)
Out[2]:
In [3]:
# rules for picking kmin kmax
def rule1(n):
"""sqrt"""
k = [np.sqrt(n)/2, np.sqrt(n)]
k = map(np.ceil,k)
k = map(int, k)
return k
def rule2(n):
"""2sqrt"""
k = map(lambda x:x*2,rule1(n))
return k
def rule3(n, sk, th):
"""fixed s/k"""
k = [n * 1.0 / sk, th * n * 1.0 / sk]
k = map(np.ceil,k)
k = map(int, k)
return k
def rule4(n):
"""sk=sqrt/2,th=30%"""
return rule3(n, sk1(n), 1.3)
def rule5(n):
"""sk=300,th=30%"""
return rule3(n,300, 1.3)
# rules for picking number of samples per cluster
def sk1(n):
"""sqrt/2"""
return int(np.sqrt(n) / 2)
In [4]:
from sklearn.datasets import make_blobs
n_samples = 100000
data,gt = make_blobs(n_samples, centers=4)
%time data = data.astype(np.float32)
n_clusts = rule4(n_samples)
plt.plot(data[:,0], data[:,1], '.')
generator = myKM.K_Means()
generator._MAX_THREADS_BLOCK = 256
%time ensemble = myEacPart.generateEnsemble(data, generator, n_clusts,npartitions=100)
In [7]:
myEacSp._compute_max_assocs_from_ensemble(ensemble)*3
Out[7]:
In [8]:
%time fullEAC = myNewEac.EAC(n_samples, sparse=True, condensed=False, sparse_keep_degree=True)
fullEAC.sp_max_assocs_mode = "constant"
%time fullEAC.buildMatrix(ensemble)
%time spEAC = myNewEac.EAC(n_samples, sparse=True, condensed=True, sparse_keep_degree=True)
spEAC.sp_max_assocs_mode = "constant"
%time spEAC.buildMatrix(ensemble)
In [11]:
import seaborn as sns
sns.set_style("darkgrid")
In [12]:
sns.set_palette(sns.color_palette("deep", 6))
In [43]:
fig1 = plt.figure(figsize=(16,6))
ax = fig1.add_subplot(1,2,1)
x = np.arange(n_samples)
y = spEAC.coassoc.degree[:-1]
ax.plot(y,x, 'k', alpha=1)
ax.fill_between(y,0,x,color='k', alpha=1)
ax.plot([spEAC.sp_max_assocs, spEAC.sp_max_assocs], [0, n_samples], 'r', label="max. number of assocs.")
ax.set_title("Number of associations per sample on a condensed matrix")
ax.set_ylabel("Sample no.")
ax.set_xlabel("Number of associations")
plt.gca().invert_yaxis()
ax.legend(loc="best")
ax = fig1.add_subplot(1,2,2)
x = np.arange(n_samples)
y = fullEAC.coassoc.degree[:-1]
ax.plot(y,x, 'k', alpha=1)
ax.fill_between(y,0,x,color='k', alpha=1)
ax.plot([spEAC.sp_max_assocs, spEAC.sp_max_assocs], [0, n_samples], 'r', label="max. number of assocs.")
ax.set_title("Number of associations per sample on a condensed matrix")
ax.set_ylabel("Sample no.")
ax.set_xlabel("Number of associations")
plt.gca().invert_yaxis()
ax.legend(loc="best")
Out[43]:
In [45]:
fig_cond = plt.figure()
x = np.arange(n_samples)
y = spEAC.coassoc.degree[:-1]
plt.barh(x, y)
plt.plot([spEAC.sp_max_assocs, spEAC.sp_max_assocs], [0, n_samples], 'r', label="max. number of assocs.")
plt.title("Number of associations per sample on a condensed matrix")
plt.ylabel("Sample no.")
plt.xlabel("Number of associations")
plt.gca().invert_yaxis()
plt.legend(loc="best")
Out[45]:
In [13]:
from MyML.helper.plotting import save_fig
In [47]:
save_fig(fig_cond, "/home/chiroptera/eac_csr_cond_degree")
save_fig(fig_full, "/home/chiroptera/eac_csr_full_degree")
In [46]:
fig_full = plt.figure()
x = np.arange(n_samples)
y = fullEAC.coassoc.degree[:-1]
plt.barh(x, y)
plt.plot([spEAC.sp_max_assocs, spEAC.sp_max_assocs], [0, n_samples], 'r', label="max. number of assocs.")
plt.title("Number of associations per sample on a complete matrix")
plt.ylabel("Sample no.")
plt.xlabel("Number of associations")
plt.gca().invert_yaxis()
plt.legend(loc="best")
Out[46]:
In [7]:
%time fullEAC.finalClustering(n_clusters=0)
%time spEAC.finalClustering(n_clusters=0)
# b MyML/EAC/eac_new.py:606
# b MyML/EAC/eac_new.py:556
Out[7]:
In [100]:
histFig = plt.figure(figsize=(16,6))
ax = histFig.add_subplot(1,2,1)
ax.hist(fullEAC.labels,bins=fullEAC.n_fclusts)
ax.set_title("Full coassoc")
print "full bincount", np.bincount(fullEAC.labels)
ax = histFig.add_subplot(1,2,2)
ax.hist(spEAC.labels,bins=spEAC.n_fclusts)
ax.set_title("Sparse coassoc")
print "full bincount", np.bincount(spEAC.labels)
In [101]:
acc = myAcc.HungarianIndex(n_samples)
acc.score(gt, fullEAC.labels)
acc.accuracy
Out[101]:
In [102]:
acc = myAcc.HungarianIndex(n_samples)
acc.score(gt, spEAC.labels)
acc.accuracy
Out[102]:
In [103]:
acc2 = myAcc.ConsistencyIndex(N=n_samples)
acc2.score(gt, fullEAC.labels)
Out[103]:
In [104]:
acc2 = myAcc.ConsistencyIndex(N=n_samples)
acc2.score(gt, spEAC.labels)
Out[104]:
In [36]:
for c in xrange(fullEAC.n_fclusts):
c_idx = fullEAC.labels == c
plt.plot(data[c_idx,0], data[c_idx,1], '.')
In [37]:
for c in xrange(spEAC.n_fclusts):
c_idx = spEAC.labels == c
plt.plot(data[c_idx,0], data[c_idx,1], '.')
In [20]:
(fullEAC.coassoc.todense() == (spEAC.coassoc.todense() + spEAC.coassoc.todense().T)).all()
Out[20]:
In [14]:
%time full = myEacFull.EAC_FULL(n_samples, condensed=False)
%time full.update_ensemble(ensemble)
In [15]:
%time cfull = myEacFull.EAC_FULL(n_samples, condensed=True)
%time cfull.update_ensemble(ensemble)
In [16]:
(full.coassoc == cfull.todense() + ).all()
Out[16]:
In [17]:
max_assocs = myEacSp._compute_max_assocs_from_ensemble(ensemble) * 3
%time sp = myEacSp.EAC_CSR(n_samples, max_assocs, condensed=False, sort_mode="numpy")
%time sp.update_ensemble(ensemble)
In [18]:
max_assocs = myEacSp._compute_max_assocs_from_ensemble(ensemble) * 3
%time sp2 = myEacSp.EAC_CSR(n_samples, max_assocs, condensed=False, sort_mode="surgical")
%time sp2.update_ensemble(ensemble)
In [21]:
sp._condense(keep_degree=True)
sp2._condense(keep_degree=True)
print (sp.todense() == sp2.todense()).all()
print (sp.todense() == full.coassoc).all()
In [22]:
max_assocs = myEacSp._compute_max_assocs_from_ensemble(ensemble) * 3
%time csp = myEacSp.EAC_CSR(n_samples, max_assocs, condensed=True, sort_mode="numpy")
%time csp.update_ensemble(ensemble)
In [23]:
max_assocs = myEacSp._compute_max_assocs_from_ensemble(ensemble) * 3
%time csp2 = myEacSp.EAC_CSR(n_samples, max_assocs, condensed=True, sort_mode="surgical")
%time csp2.update_ensemble(ensemble)
In [24]:
csp._condense(keep_degree=True)
csp2._condense(keep_degree=True)
print "condensed numpy", ((csp.todense() + csp.todense().T) == full.coassoc).all()
print "condensed surgical", ((csp2.todense() + csp2.todense().T) == full.coassoc).all()
In [25]:
max_assocs = myEacSp._compute_max_assocs_from_ensemble(ensemble) * 3
%time csp2 = myEacSp.EAC_CSR(n_samples, max_assocs, max_assocs_type='linear', condensed=True, sort_mode="surgical")
%time csp2.update_ensemble(ensemble)
In [26]:
csp2._condense(keep_degree=True)
In [27]:
print "condensed surgical", ((csp2.todense() + csp2.todense().T) == full.coassoc).all()
In [4]:
import scipy.sparse.csgraph as csgraph
In [65]:
reload(csgraph)
Out[65]:
In [202]:
mst.indptr
Out[202]:
In [54]:
from scipy.sparse.csgraph._validation import validate_graph
In [63]:
c2 = validate_graph(coassoc, True, dtype=coassoc.dtype, dense_output=False)
In [64]:
c2
Out[64]:
In [13]:
import scipy_numba.sparse.csgraph._min_spanning_tree as myMST
In [53]:
reload(myMST)
In [23]:
coassoc2 = spEAC.coassoc.csr.copy()
coassoc2.data = coassoc2.data.max() + 1 - coassoc2.data
%time mst2 = myMST.minimum_spanning_tree(coassoc2)
In [39]:
coassoc = spEAC.coassoc.csr.copy()
coassoc.data = coassoc.data.max() + 1 - coassoc.data
In [24]:
coassoc = spEAC.coassoc.csr.copy()
coassoc.data = coassoc.data.max() + 1 - coassoc.data
%time mst = csgraph._min_spanning_tree.minimum_spanning_tree(coassoc)
In [27]:
mst = mst.astype(mst2.dtype)
In [42]:
coassoc_val = csgraph._validation.validate_graph(coassoc, True, dense_output=False)
In [52]:
coassoc_val.data
Out[52]:
In [51]:
coassoc.data[0]=95
In [46]:
coassoc_val.indices
Out[46]:
In [36]:
a = csgraph.connected_components(mst)
In [35]:
fig = plt.figure(figsize=(16,6))
ax = fig.add_subplot(1,2,1)
plt.hist(mst.data, bins=mst.max())
ax = fig.add_subplot(1,2,2)
plt.hist(mst2.data, bins=mst2.max())
Out[35]:
In [131]:
# inputs
n_clusters = 4
n_partitions = 100
In [132]:
# converting to diassociations
coassoc.data = coassoc.data.max() + 1 - coassoc.data
# get minimum spanning tree
mst = csgraph.minimum_spanning_tree(coassoc).astype(np.uint8)
# compute number of disconnected components
n_disconnect_clusters = mst.shape[0] - mst.nnz
# sort associations by weights
asort = mst.data.argsort()
sorted_weights = mst.data[asort]
if n_clusters == 0:
# compute lifetimes
lifetimes = sorted_weights[1:] - sorted_weights[:-1]
# add 1 to n_partitions as the maximum weight because I also added
# 1 when converting to diassoc to avoid having zero weights
disconnect_lifetime = n_partitions + 1 - sorted_weights[-1]
# maximum lifetime
m_index = np.argmax(lifetimes)
th = lifetimes[m_index]
# get number of clusters from lifetimes
indices = np.where(mst.data > mst.data[m_index])[0]
if indices.size == 0:
cont = 1
else:
cont = indices.size + 1
#testing the situation when only 1 cluster is present
# if maximum lifetime is smaller than 2 times the minimum
# don't make any cuts (= 1 cluster)
# max>2*min_interval -> nc=1
close_to_zero_indices = np.where(np.isclose(lifetimes, 0))
minimum = np.min(lifetimes[close_to_zero_indices])
if th < 2 * minimum:
cont = 1
# add disconnected clusters to number of clusters if disconnected
# lifetime is smaller
if th > disconnect_lifetime:
cont += n_disconnect_clusters
else:
cont = n_disconnect_clusters
nc_stable = cont
else:
nc_stable = n_clusters
n_comps, labels = csgraph.connected_components(mst)
In [133]:
print n_comps, labels
plt.hist(labels, bins=n_comps, log=True)
Out[133]:
In [134]:
for c in xrange(n_comps):
c_idx = labels == c
plt.plot(data[c_idx,0],data[c_idx,1],'.')
In [33]:
nc_stable
Out[33]:
In [66]:
nc_stable
Out[66]:
In [ ]:
if n_clusters == 0:
# lifetime is here computed as the distance difference between
# any two consecutive nodes, i.e. the distance between passing
# from n to n-1 clusters
lifetimes = Z[1:,2] - Z[:-1,2]
m_index = np.argmax(lifetimes)
# Z is ordered in increasing order by weight of connection
# the connection whose weight is higher than the one specified
# by m_index MUST be the one from which the jump originated the
# maximum lifetime; all connections above that (and including)
# will be removed for the final clustering
indices = np.where(Z[:,2] > Z[m_index, 2])[0]
#indices = np.arange(m_index+1, Z.shape[0])
if indices.size == 0:
cont = 1
else:
cont = indices.size + 1
# store maximum lifetime
th = lifetimes[m_index]
#testing the situation when only 1 cluster is present
# if maximum lifetime is smaller than 2 times the minimum
# don't make any cuts (= 1 cluster)
#max>2*min_interval -> nc=1
close_to_zero_indices = np.where(np.isclose(lifetimes, 0))
minimum = np.min(lifetimes[close_to_zero_indices])
if th < 2 * minimum:
cont = 1
nc_stable = cont
else:
nc_stable = n_clusters
if nc_stable > 1:
# only the labels are of interest
labels = labels_from_Z(Z, n_clusters=nc_stable)
# rename labels
i=0
for l in np.unique(labels):
labels[labels == l] = i
i += 1
else:
labels = np.zeros(self.n_samples, dtype = np.int32)
self.labels_ = labels
return labels
In [159]:
from scipy.cluster.hierarchy import linkage
In [204]:
coassoc = cfull.coassoc.copy()
In [207]:
myEac.make_diassoc_1d(coassoc, n_partitions)
# apply linkage
%time Z = myEac.linkage(coassoc, method="average")
In [208]:
Z
Out[208]:
In [185]:
labels = myEac.labels_from_Z(Z, n_clusters=100)
# rename labels
i=0
for l in np.unique(labels):
labels[labels == l] = i
i += 1
In [209]:
plt.hist(Z[:,2], bins=Z.shape[0])
Out[209]:
In [186]:
print labels
plt.hist(labels, bins=100, log=True)
Out[186]:
In [14]:
fig1 = plt.figure(figsize=(16,6))
ax = fig1.add_subplot(1,2,1)
x = np.arange(n_samples)
y = fullEAC.coassoc.degree[:-1]
x_max = [0, n_samples]
y_max = [fullEAC.sp_max_assocs, fullEAC.sp_max_assocs]
ax.plot(x, y, 'k', alpha=1)
ax.fill_between(x, 0, y, color='k', alpha=1)
ax.plot(x_max, y_max, 'r', label="max. number of assocs.")
ax.set_title("Number of associations per sample on a condensed matrix")
ax.set_xlabel("Sample no.")
ax.set_ylabel("Number of associations")
#ax.legend(loc=4)
################################################
ax = fig1.add_subplot(1,2,2)
x = np.arange(n_samples)
y = spEAC.coassoc.degree[:-1]
lin_indptr = myEacSp.indptr_linear(n_samples, fullEAC.sp_max_assocs,
spEAC.coassoc.n_s, spEAC.coassoc.n_e,
spEAC.coassoc.val_s, spEAC.coassoc.val_e)
y_lin = lin_indptr[1:] - lin_indptr[:-1]
fit = np.polyfit(x,y,1) # linear regression of degree
fit_fn = np.poly1d(fit)
y_reg = fit_fn(x)
y_reg = np.where(y_reg<0,0,y_reg)
ax.plot(x, y, color='k')
ax.fill_between(x, 0, y, color='k', alpha=1)
ax.plot(x, y_reg, label="linear regression of no. of assocs.")
ax.plot(x, y_lin, label="pre-allocated no. assocs.")
ax.plot(x_max, y_max, label="max. number of assocs.")
ax.set_title("Number of associations per sample on a condensed matrix")
ax.set_xlabel("Sample no.")
ax.set_ylabel("Number of associations")
ax.legend(loc=[0.5,0.7])
Out[14]:
In [15]:
save_fig(fig1, "/home/chiroptera/eac_csr_cond_full_degree_100k")
In [54]:
x = np.arange(1,10,0.05)
y = 1 / np.sqrt(x)
y = y.max() -y
y = y / y.max()
plt.plot(x,y)
plt.plot(x,y[::-1])
#plt.plot(x,np.sqrt(x)[::-1]/np.sqrt(x).max())
Out[54]:
In [52]:
y.sum()
Out[52]:
In [15]:
x = np.arange(0,2,0.05)
y = np.exp(x)
y /= y.max()
y = 1-y
plt.plot(x,y)
Out[15]:
In [57]:
reload(myEacSp)
Out[57]:
In [60]:
y, y_sum = myEacSp.linear(n_samples, max_assocs, 0.1, 1, 1, 0.02)
In [61]:
total_area = n_samples * max_assocs
y_sum * 1.0 / total_area
Out[61]:
In [13]:
fig = plt.figure(figsize=(8,6))
x = np.arange(n_samples)
ax = fig.add_subplot(1,1,1)
ax.bar(x, csp.degree[:-1])
ax.plot([0,n_samples],[max_assocs, max_assocs])
#ax.plot(np.arange(n_samples), max_assoc_scheme)
ax.plot(x,y)
ax.set_title("full sparse matrix")
ax.set_xlabel("sample id")
Out[13]:
In [67]:
import MyML.helper.scan as myScan
reload(myScan)
Out[67]:
In [63]:
y_c = y.copy()
In [71]:
y_sum
Out[71]:
In [72]:
y_c
Out[72]:
In [68]:
myScan.exprefixsumNumbaSingle(y)
Out[68]:
In [77]:
class testme:
def __init__(self,**kwargs):
self.val2=kwargs.get("val2", 50)
In [80]:
a=testme(val2=100)
a.val2
Out[80]:
In [73]:
y
Out[73]:
In [76]:
((y[1:] - y[:-1]) == y_c[:-1]).all()
Out[76]: